In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sb
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
In [2]:
import os
# List the files in the local data folder to confirm the CSVs are present.
# NOTE(review): this absolute Windows path is machine-specific; the cell only
# works on the original author's computer.
print(os.listdir("C:\\Users\\ajaohri\\Desktop\\all"))
# Any results you write to the current directory are saved as output.
In [3]:
# Location of the training CSV (machine-specific absolute path).
url_train = 'C:\\Users\\ajaohri\\Desktop\\all/train.csv'

# Load the training passengers and preview the first five rows.
titanic = pd.read_csv(filepath_or_buffer=url_train)
titanic.head(n=5)
Out[3]:
In [4]:
# Bar chart of the 'Survived' values: verifies the target only takes two classes.
sb.countplot(data=titanic, x='Survived')
Out[4]:
In [5]:
# Number of missing entries in each column of the raw training frame.
titanic.isna().sum()
Out[5]:
Dropping PassengerId, Name and Ticket because they are unique to each passenger. Cabin has too many null values to use directly, so it is kept for now and converted into a coarse numeric score further below.
In [6]:
# Remove the identifier-like columns. `axis` is passed by keyword because
# recent pandas versions no longer accept it as a positional argument.
titanic_data = titanic.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
titanic_data.head()
Out[6]:
Next we need to take care of the missing values in the Age variable. They have to be approximated: one way is to use the mean age for all missing values; another is to check whether age is related to Pclass and, if so, assign a class-specific value.
In [7]:
# Age distribution per passenger class, used to choose the imputation values.
sb.boxplot(data=titanic_data, x='Pclass', y='Age')
Out[7]:
Missing ages are filled in according to passenger class: 37 for Pclass 1, 29 for Pclass 2, and 24 for Pclass 3.
In [8]:
def age_approx(cols):
    """Return a passenger's age, substituting a class-based default when missing.

    Parameters
    ----------
    cols : sequence
        Two values in the order ``(age, pclass)``. Usually a row of
        ``df[['Age', 'Pclass']]`` supplied by ``DataFrame.apply(..., axis=1)``,
        but any list- or tuple-like object works.

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1,
    29 for class 2 and 24 for any other class.
    """
    # Read the two fields by position through plain iteration. Integer keys
    # such as ``cols[0]`` on a label-indexed Series are deprecated in pandas.
    values = list(cols)
    age, pclass = values[0], values[1]

    if not pd.isnull(age):
        return age
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24
In [9]:
# Fill the missing ages row by row, then re-check the per-column null counts.
titanic_data['Age'] = titanic_data.loc[:, ['Age', 'Pclass']].apply(age_approx, axis='columns')
titanic_data.isna().sum()
Out[9]:
In [10]:
def cabin_approx(cols):
    """Convert a cabin code into a coarse numeric score based on its first letter.

    Parameters
    ----------
    cols : sequence
        Values in the order ``(cabin, pclass)``. Only the cabin value is used;
        the second entry is accepted so the function can be applied to the
        same two-column selection as the other helpers.

    Returns
    -------
    int
        3 for cabins starting with C or B, 2 for A, D, E or T, 1 for F or G,
        and 0 when the cabin is missing or starts with any other character.
    """
    # Read by position through iteration; integer keys on a label-indexed
    # Series are deprecated in pandas.
    cabin = list(cols)[0]
    if pd.isnull(cabin):
        return 0

    # Slicing (rather than cabin[0]) keeps an empty string from raising.
    deck = cabin[:1]

    # Membership tests are required here: `deck == ('C' or 'B')` evaluates the
    # parenthesised expression first and therefore only ever compares with 'C'.
    if deck in ('C', 'B'):
        return 3
    if deck in ('A', 'D', 'E', 'T'):
        return 2
    if deck in ('F', 'G'):
        return 1
    return 0
In [11]:
# Replace the raw cabin codes with their numeric score.
titanic_data['Cabin'] = titanic_data.loc[:, ['Cabin', 'Pclass']].apply(cabin_approx, axis='columns')

# Compare fares across the encoded cabin groups.
sb.boxplot(data=titanic_data, x='Cabin', y='Fare')
Out[11]:
There are two null values in Embarked; since that is so few rows, we can simply drop them.
In [12]:
# Discard every row that still has a missing value, modifying the frame in place,
# then confirm that no nulls remain.
titanic_data.dropna(axis=0, how='any', inplace=True)
titanic_data.isna().sum()
Out[12]:
Getting dummy variables from categorical ones.
In [13]:
# One-hot encode Sex; drop_first removes the first category so a single
# indicator column remains.
gender = pd.get_dummies(titanic_data.loc[:, 'Sex'], drop_first=True)
gender.head(n=5)
Out[13]:
In [14]:
# One-hot encode the embarkation port, dropping the first category.
embark_location = pd.get_dummies(titanic_data.loc[:, 'Embarked'], drop_first=True)
embark_location.head(n=5)
Out[14]:
In [15]:
# The original text columns are no longer needed once their dummies exist.
# This mutates titanic_data in place, so the cell cannot be run twice.
titanic_data.drop(columns=['Sex', 'Embarked'], inplace=True)
titanic_data.head(n=5)
Out[15]:
In [16]:
# Join the numeric columns and the dummy columns side by side.
titanic_dmy = pd.concat(
    [titanic_data, gender, embark_location],
    axis='columns',
)
titanic_dmy.tail(n=5)
Out[16]:
In [17]:
# Heatmap of the pairwise correlations between all columns of titanic_dmy.
sb.heatmap(data=titanic_dmy.corr(), square=True)
Out[17]:
In [18]:
# `sklearn.cross_validation` was removed from scikit-learn; the function now
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# `.ix` was removed from pandas, so select by position with `.iloc`.
# Columns 1-9 are the features and column 0 is used as the target
# (expected to be 'Survived' -- verify against titanic_dmy.columns).
# dtype=float gives the estimators a plain numeric array even when the dummy
# columns are stored as booleans.
X = titanic_dmy.iloc[:, 1:10].to_numpy(dtype=float)
y = titanic_dmy.iloc[:, 0].to_numpy()

# Hold out 10% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=2)
The train/test split holds out 10% of the labelled data so that the models and their parameters can be evaluated. We now fit the models.
In [20]:
!pip install xgboost
In [24]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Three base learners, each with a fixed seed.
clf1 = SVC(C=1.0, kernel='linear', random_state=3)
clf2 = XGBClassifier(random_state=3)
clf3 = RandomForestClassifier(max_depth=10, n_estimators=30, random_state=300)

# Combine them by majority ('hard') vote and fit on the training split.
eclf = VotingClassifier(
    estimators=[('clf1', clf1), ('clf2', clf2), ('clf3', clf3)],
    voting='hard',
)
eclf.fit(X_train, y_train)

# Evaluate on the held-out split: confusion matrix, then accuracy.
y_pred = eclf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(eclf.score(X_test, y_test))
Now loading the competition data.
In [25]:
# Location of the competition CSV (machine-specific absolute path).
url = 'C:\\Users\\ajaohri\\Desktop\\all/test.csv'

# Load the competition passengers and preview the first five rows.
test = pd.read_csv(filepath_or_buffer=url)
test.head(n=5)
Out[25]:
In [26]:
# Number of missing entries in each column of the competition frame.
test.isna().sum()
Out[26]:
There are 86 null values in Age, so we approximate them as we did earlier. There are 327 null values in Cabin, so instead of using the raw values we convert the column to the same numeric score used for the training data (a missing cabin becomes 0). There is 1 null value in Fare, so we approximate it with the median fare of that passenger's class.
In [27]:
# Summary statistics of the competition data, as a sanity check before cleaning.
test.describe()
Out[27]:
In [28]:
# Enlarge seaborn figures (width, height in inches) for the plots that follow.
sb.set(rc={'figure.figsize': (11.7, 8.27)})

# Fare distribution per passenger class, used to choose the fare defaults.
ax = sb.boxplot(data=test, x='Pclass', y='Fare', width=0.9)
In [29]:
def fare_approx(cols):
    """Return a passenger's fare, substituting a class-based default when missing.

    Parameters
    ----------
    cols : sequence
        Two values in the order ``(fare, pclass)``. Usually a row of
        ``df[['Fare', 'Pclass']]`` supplied by ``DataFrame.apply(..., axis=1)``,
        but any list- or tuple-like object works.

    Returns
    -------
    The original fare when it is present; otherwise 55 for class 1,
    20 for class 2 and 10 for any other class.
    """
    # Read the two fields by position through plain iteration. Integer keys
    # such as ``cols[0]`` on a label-indexed Series are deprecated in pandas.
    values = list(cols)
    fare, pclass = values[0], values[1]

    if not pd.isnull(fare):
        return fare
    if pclass == 1:
        return 55
    if pclass == 2:
        return 20
    return 10
Cleaning up the test data: Dropping variables, approximating age and fare, dummy variables.
In [30]:
# Drop the free-text columns. PassengerId is kept because the submission file
# needs it. `axis` is passed by keyword because recent pandas versions no
# longer accept it positionally.
test_data = test.drop(['Name', 'Ticket'], axis=1)

# Apply the same imputations/encodings that were used on the training data.
test_data['Age'] = test_data[['Age', 'Pclass']].apply(age_approx, axis=1)
test_data['Fare'] = test_data[['Fare', 'Pclass']].apply(fare_approx, axis=1)
test_data['Cabin'] = test_data[['Cabin', 'Pclass']].apply(cabin_approx, axis=1)

# Encode the categorical columns and assemble the final frame.
gender_test = pd.get_dummies(test_data['Sex'], drop_first=True)
embark_location_test = pd.get_dummies(test_data['Embarked'], drop_first=True)
test_data.drop(['Sex', 'Embarked'], axis=1, inplace=True)
test_dmy = pd.concat([test_data, gender_test, embark_location_test], axis=1)

# Every competition row needs a prediction, so rows must not be dropped here.
# The previous `test_data.dropna(inplace=True)` ran after test_dmy was built,
# so it never cleaned test_dmy and could only have made the PassengerId column
# shorter than the prediction array. Fail loudly instead if anything is missing.
assert not test_dmy.isnull().values.any(), (
    "test_dmy still has missing values:\n" + test_dmy.isnull().sum().to_string()
)
test_dmy.isnull().sum()
Out[30]:
In [31]:
# Preview the prepared competition frame (first five rows).
test_dmy.head()
Out[31]:
In [32]:
# `.ix` was removed from pandas, so select by position with `.iloc`.
# Column 0 is skipped and columns 1-9 are used as features, the same positions
# as for the training matrix. dtype=float gives the estimators a plain numeric
# array even when the dummy columns are stored as booleans.
X_competition = test_dmy.iloc[:, 1:10].to_numpy(dtype=float)
Prediction for Competition Data
In [33]:
# Predict a label for every competition row with the fitted voting ensemble.
y_comp = eclf.predict(X_competition)
In [34]:
# Pair each passenger id with its predicted label.
submission = pd.DataFrame(
    data={
        'PassengerId': test_data['PassengerId'],
        'Survived': y_comp,
    }
)
submission.head(n=5)
Out[34]:
In [35]:
# Write the predictions to a CSV in the working directory, without the index column.
filename = 'Titanic Predictions 1.csv'
submission.to_csv(path_or_buf=filename, index=False)
print('Saved file: ' + filename)
In [38]:
# Show the working directory, i.e. where the relative-path CSV was written.
os.getcwd()
Out[38]:
In [ ]: